{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 规范化文本匹配实用函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Utility functions for normalized Unicode string comparison.\n",
    "\n",
    "Using Normal Form C, case sensitive:\n",
    "\n",
    "    >>> s1 = 'café'\n",
    "    >>> s2 = 'cafe\\u0301'\n",
    "    >>> s1 == s2\n",
    "    False\n",
    "    >>> nfc_equal(s1, s2)\n",
    "    True\n",
    "    >>> nfc_equal('A', 'a')\n",
    "    False\n",
    "\n",
    "Using Normal Form C with case folding:\n",
    "\n",
    "    >>> s3 = 'Straße'\n",
    "    >>> s4 = 'strasse'\n",
    "    >>> s3 == s4\n",
    "    False\n",
    "    >>> nfc_equal(s3, s4)\n",
    "    False\n",
    "    >>> fold_equal(s3, s4)\n",
    "    True\n",
    "    >>> fold_equal(s1, s2)\n",
    "    True\n",
    "    >>> fold_equal('A', 'a')\n",
    "    True\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "from unicodedata import normalize\n",
    "\n",
    "def nfc_equal(str1, str2):\n",
    "    return normalize('NFC', str1) == normalize('NFC', str2)\n",
    "\n",
    "def flod_equal(str1, str2):\n",
    "    return (normalize('NFC', str1).casefold() == \n",
    "            normalize('NFC', str2).csefold())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 极端“规范化”：去掉变音符号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import unicodedata\n",
    "import string\n",
    "\n",
    "def shave_marks(txt):\n",
    "    \"\"\" 去掉变音字符 \"\"\"\n",
    "    norm_txt = unicodedata.normalize('NFD', txt)\n",
    "    shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))\n",
    "    return unicodedata.normalize('NFC', shaved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'“Herr Voß: • ½ cup of OEtker™ caffe latte • bowl of acai.”'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "order = '“Herr Voß: • ½ cup of OEtker™ caffè latte • bowl of açaí.”'\n",
    "shave_marks(order)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def shave_marks_latin(txt):\n",
    "    \"\"\"把拉丁基字符中所有的变音符号删除\"\"\"\n",
    "    norm_txt = unicodedata.normalize('NFD', txt) # 忽略拉丁基字符上的变音符号\n",
    "    latin_base = False\n",
    "    keepers = []\n",
    "    for c in norm_txt:\n",
    "        if unicodedata.combining(c) and latin_base:\n",
    "            continue\n",
    "        keepers.append(c)\n",
    "        # 如果不是组合字符，那就是新的基字符\n",
    "        if not unicodedata.combining(c):\n",
    "            latin_base = c in string.ascii_letters\n",
    "    shaved  = ''.join(keepers)\n",
    "    return unicodedata.normalize('NFC', shaved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Zέφupoς, Zefiro'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shave_marks_latin('Zέφupoς, Zéfiro')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 把一些西文印刷字符转换成 ASCII 字符"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "single_map = str.maketrans(\"\"\"‚ƒ„†ˆ‹‘’“”•–—˜›\"\"\",\n",
    "                           \"\"\"'f\"*^<''\"\"---~>\"\"\")\n",
    "multi_map = str.maketrans({\n",
    "    '€': '<euro>',\n",
    "    '…': '...',\n",
    "    'Œ': 'OE',\n",
    "    '™': '(TM)',\n",
    "    'œ': 'oe',\n",
    "    '‰': '<per mille>',\n",
    "    '‡': '**',\n",
    "})\n",
    "multi_map.update(single_map)\n",
    "\n",
    "def dewinize(txt):\n",
    "    \"\"\"Replace Win1252 symbols with ASCII chars or sequences\"\"\"\n",
    "    return txt.translate(multi_map)\n",
    "\n",
    "def asciize(txt):\n",
    "    no_marks = shave_marks_latin(dewinize(txt))\n",
    "    no_marks = no_marks.replace('ß', 'ss')\n",
    "    return unicodedata.normalize('NFKC', no_marks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí.\"'"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'\n",
    "dewinize(order)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai.\"'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "asciize(order)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Unicode 数据库中数值字符的元数据示例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "U+0031\t  1   \tre_dig\tisdig\tisnum\t 1.00\tDIGIT ONE\n",
      "U+00bc\t  ¼   \t-\t-\tisnum\t 0.25\tVULGAR FRACTION ONE QUARTER\n",
      "U+00b2\t  ²   \t-\tisdig\tisnum\t 2.00\tSUPERSCRIPT TWO\n",
      "U+0969\t  ३   \tre_dig\tisdig\tisnum\t 3.00\tDEVANAGARI DIGIT THREE\n",
      "U+136b\t  ፫   \t-\tisdig\tisnum\t 3.00\tETHIOPIC DIGIT THREE\n",
      "U+216b\t  Ⅻ   \t-\t-\tisnum\t12.00\tROMAN NUMERAL TWELVE\n",
      "U+2466\t  ⑦   \t-\tisdig\tisnum\t 7.00\tCIRCLED DIGIT SEVEN\n",
      "U+2480\t  ⒀   \t-\t-\tisnum\t13.00\tPARENTHESIZED NUMBER THIRTEEN\n",
      "U+3285\t  ㊅   \t-\t-\tisnum\t 6.00\tCIRCLED IDEOGRAPH SIX\n"
     ]
    }
   ],
   "source": [
    "import unicodedata\n",
    "import re\n",
    "\n",
    "\n",
    "re_digit = re.compile(r'\\d')\n",
    "sample = '1\\xbc\\xb2\\u0969\\u136b\\u216b\\u2466\\u2480\\u3285'\n",
    "\n",
    "for char in sample:\n",
    "    print('U+%04x' % ord(char),\n",
    "          char.center(6),\n",
    "          're_dig' if re_digit.match(char) else '-',\n",
    "          'isdig' if char.isdigit() else '-',\n",
    "          'isnum' if char.isnumeric() else '-',\n",
    "          format(unicodedata.numeric(char), '5.2f'),\n",
    "          unicodedata.name(char),\n",
    "          sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 支持字符串和字节序列的双模式API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Text\n",
      "  'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'\n",
      "Numbers\n",
      "  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']\n",
      "  bytes  : [b'1729', b'1', b'12', b'9', b'10']\n",
      "Words\n",
      "  str  : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']\n",
      "  bytes  : [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "\n",
    "re_numbers_str = re.compile(r'\\d+')\n",
    "re_words_str = re.compile(r'\\w+')\n",
    "re_numbers_bytes = re.compile(rb'\\d+')\n",
    "re_words_bytes = re.compile(rb'\\w+')\n",
    "\n",
    "text_str = (\"Ramanujan saw \\u0be7\\u0bed\\u0be8\\u0bef\"\n",
    "            \" as 1729 = 1³ + 12³ = 9³ + 10³.\")\n",
    "text_bytes = text_str.encode('utf_8')\n",
    "\n",
    "print('Text', repr(text_str), sep='\\n  ')\n",
    "print('Numbers')\n",
    "print('  str  :', re_numbers_str.findall(text_str))\n",
    "print('  bytes  :', re_numbers_bytes.findall(text_bytes))\n",
    "print('Words')\n",
    "print('  str  :', re_words_str.findall(text_str))\n",
    "print('  bytes  :', re_words_bytes.findall(text_bytes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
